/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.tools;

import java.io.*;
import java.net.*;
import java.util.*;
import java.text.*;
import java.util.logging.*;

import net.nutch.io.*;
import net.nutch.db.*;
import net.nutch.util.*;
import net.nutch.pagedb.*;
import net.nutch.linkdb.*;

/**********************************************
 * This class takes an IWebDBReader, computes a relevant subset,
 * and then emits the subset.
 *
 * @author Mike Cafarella
 ***********************************************/
public class FetchListTool {
    public static final Logger LOG = LogFormatter.getLogger("net.nutch.tools.FetchListTool");

    private static String TOP_N_SORTER = "topNSorter";
    private static final long FETCH_GENERATION_DELAY_MS = 7 * 24 * 60 * 60 * 1000;

    File dbDir;
    boolean refetchOnly, anchorOptimize;
    float cutoffScore;
    int seed;

    /**
     * The TableSet class will allocate a given FetchListEntry
     * into one of several ArrayFiles. It chooses which ArrayFile
     * to use based on a hash of the URL's host name.
     *
     * It uses a hash of the host name so that pages are allocated
     * to an effectively random ArrayFile, but same-host pages go
     * to the same file (for efficiency purposes during fetch).
     *
     * Further, within a given file, the FetchListEntry items
     * appear in random order. This is so that we don't
     * hammer the same site over and over again during fetch.
     *
     * Each table should receive a roughly even number of entries,
     * but all URLs for a specific host will be found in a single
     * table. If the dataset is heavily skewed toward a few large
     * hosts, the distribution may be uneven.
     */
    class TableSet {
        Vector outputPaths = new Vector();
        Vector tables = new Vector();
        long appendCounts[];
        boolean hasAppended = false;

        /**
         */
        public TableSet() {
        }

        /**
         * Add a table to the list. Cannot be called
         * after we start appending entries.
         */
        public synchronized boolean add(String outputPath) throws IOException {
            if (hasAppended) {
                return false;
            }

            //
            // Record where the file should go. Then open a
            // SequenceFile.Writer to record the set of items
            // we append to each table.
            //
            outputPaths.add(outputPath);
            tables.add(new SequenceFile.Writer(outputPath + ".unsorted", MD5Hash.class, FetchListEntry.class));
            return true;
        }

        /**
         * Add FetchListEntry items to one of the tables.
         * Choose the table based on a hash of the URL's host name.
         */
        public synchronized boolean append(FetchListEntry newEntry) throws IOException {
            hasAppended = true;
            if (appendCounts == null) {
                appendCounts = new long[outputPaths.size()];
            }

            Page fetchPage = newEntry.getPage();

            // Extract the hostname from the URL
            String host = null;
            try {
                host = new URL(fetchPage.getURL().toString()).getHost().toLowerCase();
            } catch (MalformedURLException e) {
                // ignore bad URLs
                return false;
            }

            // Figure out which table is getting the item
            MD5Hash hash = MD5Hash.digest(host);
            int index = Math.abs(hash.hashCode() ^ seed) % tables.size();

            // Write it down and return
            SequenceFile.Writer writer = (SequenceFile.Writer) tables.elementAt(index);
            writer.append(fetchPage.getMD5(), newEntry);
            appendCounts[index]++;
            return true;
        }

        /**
         * Close down the TableSet, so there are no more FetchListEntries
         * expected. We now:
         *   a) Close down all the SequenceFile.Writer objects.
         *   b) Sort each file
         *   c) Read each newly-sorted file and copy to an ArrayFile
         */
        public synchronized void close() throws IOException {
            hasAppended = true;

            // A) Close all the SequenceFile.Writers
            for (Enumeration e = tables.elements(); e.hasMoreElements(); ) {
                ((SequenceFile.Writer) e.nextElement()).close();
            }

            // B) Sort the edit-files
            SequenceFile.Sorter sorter = new SequenceFile.Sorter(new MD5Hash.Comparator(), FetchListEntry.class);

            //
            // Iterate through each unsorted file. Sort it (while
            // measuring the time taken) and upon completion delete
            // the unsorted version.
            //
            long totalEntries = 0;
            double totalTime = 0;
            int i = 0;
            for (Enumeration e = outputPaths.elements(); e.hasMoreElements(); i++) {
                String name = (String) e.nextElement();
                String unsortedName = name + ".unsorted";

                long localStart = System.currentTimeMillis();
                sorter.sort(unsortedName, name + ".sorted");
                long localEnd = System.currentTimeMillis();

                if (appendCounts != null) {
                    double localSecs = ((localEnd - localStart) / 1000.0);
                    LOG.info("Processing " + unsortedName + ": Sorted " + appendCounts[i] + " entries in " + localSecs + " seconds.");
                    LOG.info("Processing " + unsortedName + ": Sorted " + (appendCounts[i] / localSecs) + " entries/second");
                    totalEntries += appendCounts[i];
                    totalTime += localSecs;
                }
                new File(name + ".unsorted").delete();
            }
            LOG.info("Overall processing: Sorted " + totalEntries + " entries in " + totalTime + " seconds.");
            LOG.info("Overall processing: Sorted " + (totalEntries / totalTime) + " entries/second");

            // C) Read in each newly-sorted file. Copy to an ArrayFile.
            for (Enumeration e = outputPaths.elements(); e.hasMoreElements(); ) {
                String name = (String) e.nextElement();
                SequenceFile.Reader reader = new SequenceFile.Reader(name + ".sorted");
                ArrayFile.Writer af = new ArrayFile.Writer(name, FetchListEntry.class);
                try {
                    MD5Hash key = new MD5Hash();
                    FetchListEntry fle = new FetchListEntry();
                    while (reader.next(key, fle)) {
                        af.append(fle);
                    }
                } finally {
                    af.close();
                    reader.close();
                    new File(name + ".sorted").delete();
                }
            }
        }
    }

    /*************************************
     * SortableScore is just a WritableComparable Float!
     *************************************/
    public static class SortableScore implements WritableComparable {
        float score;

        /**
         */
        public SortableScore() {
        }

        /**
         */
        public void set(float score) {
            this.score = score;
        }

        /**
         */
        public float getFloat() {
            return score;
        }

        ////////
        // WritableComparable
        ////////

        /**
         * Sort them in descending order!
         */
        public int compareTo(Object o) {
            SortableScore otherScore = (SortableScore) o;
            if (score < otherScore.score) {
                return 1;
            } else if (score == otherScore.score) {
                return 0;
            } else {
                return -1;
            }
        }

        /**
         */
        public void write(DataOutput out) throws IOException {
            out.writeFloat(score);
        }

        /**
         */
        public void readFields(DataInput in) throws IOException {
            this.score = in.readFloat();
        }
    }

    /**
     * FetchListTool takes a page db and emits a fetchlist subset of it.
     */
    public FetchListTool(File dbDir, boolean refetchOnly, boolean anchorOptimize, float cutoffScore, int seed) throws IOException, FileNotFoundException {
        this.dbDir = dbDir;
        this.refetchOnly = refetchOnly;
        this.anchorOptimize = anchorOptimize;
        this.cutoffScore = cutoffScore;
        this.seed = seed;
    }

    /**
     * Spit out several fetchlists, so that we can fetch across
     * several machines.
     */
    public void emitMultipleLists(File dir, int numLists, long topN, long curTime) throws IOException {
        //
        // Create tables (and directories) for each fetchlist we want.
        // Add them all to a TableSet object.
        //
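        // An illustrative sketch of the layout this method produces for,
        // say, numLists == 3 (the literal value of FetchListEntry.DIR_NAME
        // is defined elsewhere; each per-fetcher ArrayFile is written under
        // that name):
        //
        //   <dir>/<yyyyMMddHHmmss>-0/<FetchListEntry.DIR_NAME>
        //   <dir>/<yyyyMMddHHmmss>-1/<FetchListEntry.DIR_NAME>
        //   <dir>/<yyyyMMddHHmmss>-2/<FetchListEntry.DIR_NAME>
        //
        // plus a temporary <dir>/tmp_<yyyyMMddHHmmss> directory that is
        // deleted once the fetchlists have been emitted.
        //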
        TableSet tables = new TableSet();
        try {
            String datePrefix = getDate();
            File workingDir = new File(dir, "tmp_" + getDate());
            workingDir.mkdirs();
            try {
                for (int i = 0; i < numLists; i++) {
                    File subdir = new File(dir, datePrefix + "-" + i);
                    subdir.mkdir();
                    File file = new File(subdir, FetchListEntry.DIR_NAME);
                    tables.add(file.getPath());
                }

                // Now generate the fetchlist.
                emitFetchList(tables, workingDir, topN, curTime);
            } finally {
                FileUtil.fullyDelete(workingDir);
            }
        } finally {
            tables.close();
        }
    }

    /**
     * Spit out a single fetchlist, under a dated subdirectory of
     * the given segment directory.
     */
    public void emitFetchList(File segmentDir, long topN, long curTime) throws IOException {
        TableSet tables = new TableSet();
        File workingDir = new File(segmentDir, "tmp_" + getDate());
        workingDir.mkdirs();
        File subdir = new File(segmentDir, getDate());
        subdir.mkdir();
        try {
            tables.add(new File(subdir, FetchListEntry.DIR_NAME).getPath());
            try {
                emitFetchList(tables, workingDir, topN, curTime);
            } finally {
                tables.close();
            }
        } finally {
            FileUtil.fullyDelete(workingDir);
        }
    }

    private static String getDate() {
        return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date(System.currentTimeMillis()));
    }

    /**
     * Emit the fetchlist using the given TableSet. The TableSet is
     * responsible for actually appending each item to the output file;
     * this method decides which entries it receives.
     */
    void emitFetchList(TableSet tables, File workingDir, long topN, long curTime) throws IOException {
        //
        // Iterate through all the Pages, by URL. Iterating by URL
        // means we can save disk seeks when calling webdb.getLinks(URL).
        //
        // However, we don't really want the output to be in URL-ordered
        // format. We would like the output to be URL-randomized, which
        // an MD5-ordering provides nicely. But we assume here that
        // TableSet will do that randomizing for us. We just need to
        // make sure we give it a good sampling of our data. (That is,
        // if we are giving TableSet fewer than the max-possible items,
        // we should make sure the items come evenly from all over the
        // db.)
        //
        long count = 0;
        TreeMap anchorTable = new TreeMap();
        Vector unknownDomainLinks = new Vector();

        //
        // Create a comparator that matches the domainIDs for
        // Link objects.
        //
        Comparator domainComparator = new Comparator() {
            public int compare(Object o1, Object o2) {
                Link l1 = (Link) o1;
                Link l2 = (Link) o2;
                if (l1.getDomainID() < l2.getDomainID()) {
                    return -1;
                } else if (l1.getDomainID() == l2.getDomainID()) {
                    return 0;
                } else {
                    return 1;
                }
            }
        };

        //
        // Go through all the pages by URL. Filter the ones
        // we really don't want, and save the others for possible
        // emission.
        //
        SortableScore curScore = new SortableScore();
        File unsortedFile = new File(workingDir, TOP_N_SORTER + ".unsorted");
        SequenceFile.Writer writer = new SequenceFile.Writer(unsortedFile.getPath(), SortableScore.class, FetchListEntry.class);
        try {
            IWebDBReader webdb = new WebDBReader(dbDir);
            try {
                for (Enumeration e = webdb.pages(); e.hasMoreElements(); count++) {
                    // Grab the next Page.
                    Page page = (Page) e.nextElement();
                    boolean shouldFetch = true;

                    if (((count % 50000) == 0) && (count != 0)) {
                        LOG.info("Processing page " + count + "...");
                    }

                    //
                    // Don't emit it if the Page's score doesn't meet
                    // our cutoff value.
                    //
                    if ((cutoffScore >= 0) && (page.getScore() < cutoffScore)) {
                        continue;
                    }

                    //
                    // If the item is not yet ready to be fetched, move on.
                    //
                    // Also, if getNextFetchTime is set to Long.MAX_VALUE,
                    // then it should never be fetched.
                    //
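                    // Note that curTime is not necessarily "now": main() may
                    // have shifted it forward with the -adddays option, which
                    // pulls in pages that would otherwise only become due
                    // within the next N days.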
                    if (page.getNextFetchTime() > curTime || page.getNextFetchTime() == Long.MAX_VALUE) {
                        continue;
                    }

                    //
                    // If we're in refetchOnly mode, set shouldFetch to FALSE
                    // for any Pages whose URL's MD5 is the same as the
                    // listed MD5. That indicates that no content has been
                    // downloaded in the past.
                    //
                    if (refetchOnly) {
                        MD5Hash urlHash = MD5Hash.digest(page.getURL());
                        if (page.getMD5().equals(urlHash)) {
                            shouldFetch = false;
                        }
                    }

                    //
                    // If anchorOptimize mode is on, AND shouldFetch is
                    // false, then we might apply a further optimization.
                    // Since a non-fetched Page (that is, a URL-only
                    // item) can only be discovered via the incoming
                    // anchor text, we can skip those Pages that have
                    // only *empty* incoming anchor text.
                    //
                    Link inlinks[] = webdb.getLinks(page.getURL());
                    if ((! shouldFetch) && anchorOptimize) {
                        boolean foundUsefulAnchor = false;
                        for (int i = 0; i < inlinks.length; i++) {
                            UTF8 anchorText = inlinks[i].getAnchorText();
                            if ((anchorText != null) && (anchorText.toString().trim().length() > 0)) {
                                foundUsefulAnchor = true;
                                break;
                            }
                        }
                        if (! foundUsefulAnchor) {
                            continue;
                        }
                    }

                    //
                    // Uniquify identical anchor text strings by source
                    // domain. If the anchor text is identical, and
                    // the domains are identical, then the anchor should
                    // only be included once.
                    //
                    // Links will arrive in the array sorted first by URL,
                    // and then by source-MD5.
                    //
                    int uniqueAnchors = 0;
                    for (int i = 0; i < inlinks.length; i++) {
                        String anchor = inlinks[i].getAnchorText().toString().trim();
                        if (anchor.length() > 0) {
                            if (inlinks[i].getDomainID() == 0) {
                                unknownDomainLinks.add(anchor);
                            } else {
                                Set domainUniqueLinks = (Set) anchorTable.get(anchor);
                                if (domainUniqueLinks == null) {
                                    domainUniqueLinks = new TreeSet(domainComparator);
                                    anchorTable.put(anchor, domainUniqueLinks);
                                }
                                if (domainUniqueLinks.add(inlinks[i])) {
                                    uniqueAnchors++;
                                }
                            }
                        }
                    }

                    //
                    // Finally, collect the incoming anchor text for
                    // the current URL. Step one is to add the incoming
                    // anchors whose links' source-domains are unknown.
                    // (The target, obviously, is the URL we're currently
                    // processing.)
                    //
                    int i = 0;
                    String results[] = new String[uniqueAnchors + unknownDomainLinks.size()];
                    for (Enumeration e2 = unknownDomainLinks.elements(); e2.hasMoreElements(); i++) {
                        results[i] = (String) e2.nextElement();
                    }
                    unknownDomainLinks.clear();

                    //
                    // Step 2, add the anchors that have actually been
                    // uniquified by source-domain.
                    //
                    for (Iterator it = anchorTable.keySet().iterator(); it.hasNext(); ) {
                        String key = (String) it.next();
                        Set domainUniqueLinks = (Set) anchorTable.get(key);
                        for (int j = 0; j < domainUniqueLinks.size(); j++) {
                            results[i++] = key;
                        }
                    }
                    anchorTable.clear();

                    //
                    // Last, add the FetchListEntry to a file so we can
                    // sort by score. Be sure to modify the Page's
                    // fetchtime; this allows us to soon generate another
                    // fetchlist that will not include this Page. That's
                    // helpful because with two distinct fetchlists, it
                    // should be possible to fetch and perform dbupdate
                    // at the same time.
                    //
                    curScore.set(page.getScore());
                    page.setNextFetchTime(page.getNextFetchTime() + FETCH_GENERATION_DELAY_MS);
                    writer.append(curScore, new FetchListEntry(shouldFetch, page, results));
                }
            } finally {
                webdb.close();
            }
        } finally {
            writer.close();
        }

        //
        // The next step is to sort the file we created above.
        // After being sorted, we add the "topN" items to the
        // TableSet.
        //
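        // SequenceFile.Sorter orders records by their key, and
        // SortableScore.compareTo() deliberately inverts the comparison, so
        // the sorted file comes out highest-score-first. Reading just the
        // first topN records below therefore selects the best-scoring pages.
        //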
        File sortedFile = new File(workingDir, TOP_N_SORTER + ".sorted");
        SequenceFile.Sorter topNSorter = new SequenceFile.Sorter(SortableScore.class, FetchListEntry.class);
        topNSorter.sort(unsortedFile.getPath(), sortedFile.getPath());

        //
        // Last of all, add the topN items to the table set.
        //
        // This is also where we rewrite the WebDB - we need to do
        // this so we can modify the "date" field. Rewriting the
        // db can be expensive, but it's that modification that will
        // allow us to interleave fetching and db-update.
        //
        WebDBWriter dbwriter = new WebDBWriter(dbDir);
        try {
            SequenceFile.Reader reader = new SequenceFile.Reader(sortedFile.getPath());
            try {
                SortableScore key = new SortableScore();
                FetchListEntry value = new FetchListEntry();
                while (topN > 0 && reader.next(key, value)) {
                    tables.append(value);
                    topN--;

                    //
                    // Modify the Page in the webdb so that its date
                    // is set forward a week. This way, we can generate
                    // two consecutive, different fetchlists without an
                    // intervening update. So, we generate lists A and B,
                    // and start fetching A. Upon completion, we use A to
                    // update the db, and start fetching B. This way we
                    // have simultaneous dbupdate and page fetch, which
                    // should double our throughput.
                    //
                    dbwriter.addPage(value.getPage());
                }
            } finally {
                reader.close();
            }
        } finally {
            dbwriter.close();
        }
    }

    /**
     * Generate a fetchlist from the pagedb and linkdb.
     */
    public static void main(String argv[]) throws IOException, FileNotFoundException {
        if (argv.length < 2) {
            System.out.println("Usage: FetchListTool <db_dir> <segment_dir> [-refetchonly] [-anchoroptimize] [-topN N] [-cutoff cutoffscore] [-numFetchers numFetchers] [-adddays numDays]");
            return;
        }

        //
        // Required args
        //
        File dbDir = new File(argv[0]);
        File segmentDir = new File(argv[1]);
        long curTime = System.currentTimeMillis();

        //
        // Optional args
        //
        boolean refetchOnly = false, anchorOptimize = false;
        long topN = Long.MAX_VALUE;
        float cutoffScore = -1.0f;
        int numFetchers = 1;
        int seed = new Random().nextInt();

        try {
            for (int i = 2; i < argv.length; i++) {
                if ("-refetchonly".equals(argv[i])) {
                    refetchOnly = true;
                } else if ("-anchoroptimize".equals(argv[i])) {
                    anchorOptimize = true;
                } else if ("-topN".equals(argv[i])) {
                    if (i + 1 < argv.length) {
                        topN = Long.parseLong(argv[i + 1]);
                        i++;
                    } else {
                        System.out.println("No argument present for -topN");
                        return;
                    }
                } else if ("-cutoff".equals(argv[i])) {
                    if (i + 1 < argv.length) {
                        cutoffScore = Float.parseFloat(argv[i + 1]);
                        i++;
                    } else {
                        System.out.println("No argument present for -cutoff");
                        return;
                    }
                } else if ("-numFetchers".equals(argv[i])) {
                    if (i + 1 < argv.length) {
                        numFetchers = Integer.parseInt(argv[i + 1]);
                        i++;
                    } else {
                        System.out.println("No argument present for -numFetchers");
                        return;
                    }
                } else if ("-adddays".equals(argv[i])) {
                    if (i + 1 < argv.length) {
                        long numDays = Integer.parseInt(argv[i + 1]);
                        curTime += numDays * 1000L * 60 * 60 * 24;
                        i++;
                    } else {
                        System.out.println("No argument present for -adddays");
                        return;
                    }
                }
            }
        } catch (NumberFormatException nfe) {
            System.out.println("Badly-formatted number: " + nfe);
            return;
        }

        //
        // Check that args are consistent
        //
        if (anchorOptimize && !refetchOnly) {
            System.out.println("Tool cannot use -anchoroptimize option without -refetchonly option as well.");
            return;
        }

        //
        // Finally, start things up.
        //
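        // An illustrative invocation (paths and values here are made up,
        // not taken from the original tool):
        //
        //   java net.nutch.tools.FetchListTool ./db ./segments -topN 1000000 -numFetchers 4
        //
        // which would emit four fetchlists under ./segments, one per fetcher.
        //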
        LOG.info("FetchListTool started");
        if (topN != Long.MAX_VALUE) {
            LOG.info("topN:" + topN);
        }
        if (cutoffScore >= 0) {
            LOG.info("cutoffscore:" + cutoffScore);
        }
        if (numFetchers > 1) {
            LOG.info("seed:" + seed);
        }

        FetchListTool flt = new FetchListTool(dbDir, refetchOnly, anchorOptimize, cutoffScore, seed);
        if (numFetchers > 1) {
            flt.emitMultipleLists(segmentDir, numFetchers, topN, curTime);
        } else {
            flt.emitFetchList(segmentDir, topN, curTime);
        }
        LOG.info("FetchListTool completed");
    }
}
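// A minimal sketch of how a consumer might read back one of the emitted
// fetchlists. ArrayFile.Reader and its next() signature are assumed to
// mirror the ArrayFile.Writer used in TableSet.close(); check net.nutch.io
// before relying on this.
//
//   ArrayFile.Reader fetchlist = new ArrayFile.Reader(pathToFetchList);
//   try {
//       FetchListEntry fle = new FetchListEntry();
//       while (fetchlist.next(fle) != null) {
//           // hand fle off to a fetcher here
//       }
//   } finally {
//       fetchlist.close();
//   }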